We currently assign sensitivities by tail-assignment; however, there may be cases with distinctive multimodal distributions that are indicative of distinct classes of response.
We'll begin by testing the method on beatAML data, where we have a much greater number of observations and therefore better-defined distributions.
$$ p(x) = \frac{1}{\sqrt{ 2 \pi \sigma^2 }} \, e^{ - \frac{ (x - \mu)^2 }{ 2 \sigma^2 } } $$

%%javascript
// Disable Jupyter's auto-scroll on long output areas so the full
// per-inhibitor plot/print output below stays visible without a scrollbox.
IPython.OutputArea.prototype._should_scroll = function(lines) {
return false;
}
import pandas as pd
import numpy as np
from sklearn.mixture import GMM
import seaborn as sbn
from matplotlib import pyplot as plt
import warnings
# Blanket-suppress all warnings to keep notebook output readable.
# NOTE(review): this also hides sklearn deprecation warnings (e.g. for GMM) —
# consider a targeted filterwarnings category instead.
warnings.filterwarnings("ignore")
# Load beatAML dose-response data and keep one row per (inhibitor, sample, AUC).
# lab_id presumably identifies the patient sample — TODO confirm against the data dictionary.
aml_aucs = pd.read_csv('./../data/beatAML_aucs.csv')[['inhibitor','lab_id','auc']].drop_duplicates()
# Alternative input: the curated subset written at the bottom of this script.
#aml_aucs = pd.read_csv('./../data/beatAML_AUCs_subset.csv')
aml_aucs.head()
def add_normal_plot(mu, s, weight, c, ax, auc_max=300):
    '''
    Overlay a weight-scaled Gaussian PDF curve on an existing axes.

    Parameters
    ----------
    mu : float
        Mean of the Gaussian.
    s : float
        Standard deviation of the Gaussian.
    weight : float
        Mixture weight; the PDF is multiplied by this factor so the curve
        integrates to `weight` rather than 1.
    c : str
        Matplotlib color code, e.g. 'r', 'b', 'g', 'c', 'y'.
    ax : matplotlib.axes.Axes
        Axes to draw on (in place).
    auc_max : float, optional
        Exclusive upper end of the x-range evaluated in steps of 1 (default 300).
    '''
    x = np.arange(0, auc_max, 1)
    # N(mu, s^2) density: exp(-(x-mu)^2 / (2 s^2)) / sqrt(2 pi s^2), scaled by weight.
    y = weight * np.exp(-(x - mu) ** 2 / (2 * s ** 2)) / np.sqrt(2 * np.pi * s ** 2)
    # Original rebound `ax` to the line-artist list returned by plot(); the
    # rebinding served no purpose, so just draw and discard the artists.
    ax.plot(x, y, color=c, label='mean: %.1f, std: %.1f' % (mu, s))
def get_color():
    '''Yield the five matplotlib color codes 'r', 'b', 'g', 'c', 'y' in order.'''
    palette = ('r', 'b', 'g', 'c', 'y')
    yield from palette
# NOTE(review): `gen` is advanced once here and never used again — the
# plotting loop below calls get_color() afresh. This looks like leftover
# scratch code from testing the generator; safe to delete once confirmed.
gen = get_color()
next(gen)  # idiomatic form of gen.__next__()
def test_multimodal_fits(X, ntests=10, kmax=5, plot=True, title=None):
    '''
    Fit Gaussian mixture models with k = 1 .. kmax-1 components, collect
    AIC/BIC per fit, and optionally plot the data, the BIC-vs-k curve, and
    the best-BIC mixture.

    Parameters
    ----------
    X : ndarray, shape (n_samples, 1)
        Column vector of AUC values.
    ntests : int
        Number of independent single-init fits per k (random restarts).
    kmax : int
        Exclusive upper bound on the number of components.
        NOTE(review): range(1, kmax) never fits kmax itself — the caller
        passes kmax=6 to test k = 1..5; confirm the exclusive bound is intended.
    plot : bool
        If True, render diagnostic plots and print a fit summary.
    title : str or None
        Figure suptitle. Defaults to None, which falls back to the
        module-level `inhib` variable — this preserves the original
        behavior, where the function silently read the driver loop's
        variable from global scope.
    '''
    res = {x: [] for x in ['k', 'aic', 'bic']}
    for k in range(1, kmax):
        for i in range(ntests):
            gmm = GMM(n_components=k, n_init=1)
            gmm.fit(X)
            res['k'].append(k)
            res['aic'].append(gmm.aic(X))
            res['bic'].append(gmm.bic(X))
    res = pd.DataFrame(res)
    if plot:
        # Refit the best-BIC k with many restarts for a stable final model.
        best_k = res[res.bic == np.min(res.bic)].k.unique()[0]
        gmm_best = GMM(n_components=best_k, n_init=20)
        gmm_best.fit(X)
        P = gmm_best.predict(X)
        nbins = 50
        bin_ = np.arange(0, np.max(X), np.max(X)/nbins)
        # The fitted Gaussians integrate to their mixture weights, so they look
        # small next to raw counts; reweight the histogram to compare shapes.
        scalar_to_make_pretty = 0.25
        weights_ = scalar_to_make_pretty/len(X)
        f, axs = plt.subplots(1, 3, figsize=(15, 5))
        sbn.distplot(X, bins=bin_, ax=axs[0]).set_title('AUC distribution')
        sbn.scatterplot(x='k', y='bic', alpha=0.3, data=res, ax=axs[1]).set_title('BIC vs K')
        # zip() truncates at the 5 colors get_color() yields, so at most 5
        # components are drawn even if best_k were larger.
        for clas, (weight, mean, covars, c) in enumerate(
                zip(gmm_best.weights_, gmm_best.means_, gmm_best.covars_, get_color())):
            sbn.distplot(X[P == clas], bins=bin_, kde=False, color=c, ax=axs[2],
                         label='AUC', hist_kws={'weights': [weights_]*len(X[P == clas])})
            add_normal_plot(mean[0], (covars[0])**0.5, weight, c, axs[2], auc_max=np.max(X))
        axs[2].set_title('Optimal GMM fit')
        plt.legend()
        # Fix: original hard-referenced the global `inhib`, which raises
        # NameError when the function is called outside the driver loop.
        plt.suptitle(inhib if title is None else title)
        print('Number of assays (aucs): %d' % len(X))
        print('Optimal K: %d [BIC=%.1f]' % (best_k, np.min(res.bic)))
        print('GMM fit:\n\tMixture Weights: %r\n\tMeans: %r\n\tVariances: %r' % (gmm_best.weights_.ravel(), gmm_best.means_.ravel(), gmm_best.covars_.ravel()))
        print('Class counts: %r' % ['class %d: %d' % (cl, len(X[P == cl])) for cl in list(set(P))])
        plt.show()
# Fit mixtures for every inhibitor with enough observations.
# NOTE(review): test_multimodal_fits reads the loop variable `inhib` from
# global scope for its figure title — do not rename this variable.
for inhib in aml_aucs.inhibitor.unique():
    inhib_dat = aml_aucs[aml_aucs.inhibitor == inhib]
    # Keep only inhibitors with more than 400 assays.
    if inhib_dat.shape[0] > 400:
        print('------------------------------------------------')
        print('Inhibitor: %s' %inhib)
        print('------------------------------------------------')
        # sklearn estimators expect a 2-D (n_samples, n_features) array.
        AUCS = inhib_dat.auc.values.reshape(-1,1)
        test_multimodal_fits(AUCS, ntests=10, kmax=6, plot=True)
# Inhibitors whose fits above looked usable; export their rows for reuse.
# NOTE(review): to_csv writes the index column too — verify downstream
# readers expect that (the commented read at the top implies they do).
okay_data = [
    'YM-155',
    'Vandetanib (ZD6474)',
    'Trametinib (GSK1120212)',
    'Sunitinib',
    'Sorafenib',
    'Selumetinib (AZD6244)',
    'Nilotinib',
    'JAK Inhibitor I',
    'Pazopanib (GW786034)',
    'Elesclomol',
    'Dasatinib',
]
keep_mask = aml_aucs.inhibitor.isin(okay_data)
dat = aml_aucs[keep_mask]
dat.to_csv('./../data/beatAML_AUCs_subset.csv')